* This file loads the data, defines the predictor and return variables, and estimates the variables common to the different forecasts
* "modelmean" refers to the historical mean corresponding to a model, i.e. the one using the same estimation period
* "s..." indicates the use of simple returns rather than log returns

clear

*Parameters for portfolio allocation (set here, not used further in this file)
*NOTE(review): gamma is presumably the risk-aversion coefficient and minall, maxall
*the lower and upper bounds on the portfolio weight - confirm against the allocation code
global gamma 3
global minall 0
global maxall 1.5


*Load data and set periods
*Reads the sheet matching the global freq (4 = quarterly, 12 = monthly) and stores the
*period constants as globals. The constants are matched against yy further below and are
*written as year followed by quarter (quarterly) or year followed by 2-digit month (monthly).
*end1 is lastpred moved back by h-1 periods.
if $freq!=4 & $freq!=12{
	display as error "freq must be 4 (quarterly) or 12 (monthly)"
	exit 198
}
if $freq==4{
	import excel using PredictorData2022.xlsx, firstrow clear sheet("Quarterly")
	destring _all, force replace
	sort yy

	global samplestart=19471
	global firstpred=19651
	global lastpred=20224
	*Step back h-1 quarters on a running quarter count so that the year rolls over correctly;
	*subtracting h-1 directly from 20224 gives a non-existent period once h exceeds 4
	local lastq=2022*4+3-($h-1)
	global end1=floor(`lastq'/4)*10+mod(`lastq',4)+1
	global end2=19934
	global start2=19941
}
if $freq==12{
	import excel using PredictorData2022.xlsx, firstrow clear sheet("Monthly")
	destring _all, force replace
	sort yy
	global samplestart=194701
	global firstpred=196501
	global lastpred=202212
	*Same roll-over logic on a running month count (direct subtraction breaks once h exceeds 12)
	local lastm=2022*12+11-($h-1)
	global end1=floor(`lastm'/12)*100+mod(`lastm',12)+1
	global end2=199312
	global start2=199401
}


*Running observation counter used as the time-series index
generate time=_n
tsset time

*For each period constant X, store in global tX the value of time at which yy equals X
*(r(mean) is missing if no observation matches)
foreach per in samplestart firstpred lastpred end1 end2 start2{
	summarize time if yy==${`per'}
	global t`per'=r(mean)
}



*Define variables
*Log valuation ratios; dy divides by the previous period's index level
*NOTE(review): the creation order is kept unchanged in case other files select the
*predictors with a varlist range ending in lastpredictor - confirm before reordering
generate dp=ln(D12/Index)
generate dy=ln(D12/L.Index)
generate ep=ln(E12/Index)
generate dpay=ln(D12/E12)
*Copies of raw series under descriptive names
generate stockvar=svar
generate booktomarket=bm
generate netequityexp=ntis
generate treasury=tbl
generate longtermyield=lty
generate longtermreturn=ltr
*Spreads
generate termspread=lty-tbl
generate defyieldspread=BAA-AAA
generate defratespread=corpr-ltr
*Frequency-specific predictors: inflation enters lagged by one period; with quarterly data
*the last predictor is ik, with monthly data it is lagged inflation itself
if $freq==4{
	generate inflation=L.infl
	generate lastpredictor=ik
}
else if $freq==12{
	generate lastpredictor=L.infl
}

*tbill for long analysis: where tbl is missing, use next period's Rfree instead
generate tlong=tbl
replace tlong=F.Rfree if tlong==.


*One-period values; extended to h-period values in the loop below
generate riskfree=Rfree
generate lnexret=(ln(1+CRSP_SPvw)-ln(1+riskfree))
generate ret=CRSP_SPvw

*h-period average log premium is not computed from 1-period average because then it would not be nested in the regression-based forecasts
*contrary to the requirements of the Clark-West test
*For leads 1 to h-1: compound the simple return and the risk-free return, and add the
*corresponding one-period log excess return (nothing is done when h is 1)
forvalues lead=1/`=$h-1'{
	replace ret=(1+ret)*(1+F`lead'.CRSP_SPvw)-1
	replace riskfree=(1+riskfree)*(1+F`lead'.Rfree)-1
	replace lnexret=lnexret+(ln(1+F`lead'.CRSP_SPvw)-ln(1+F`lead'.Rfree))
}

*Simple h-period excess return
generate exret=ret-riskfree




*Generate mean forecasts, variance, and trailing means
*modelmean and smodelmean: expanding-window means of the log and simple excess return
*modelvar and smodelvar: variances of the same series over a trailing window
gen smodelvar=.
gen modelvar=.
gen smodelmean=.
gen modelmean=.
*svar11: rolling sum of svar
*NOTE(review): the quarterly version sums the current value plus 3 lags (4 terms) whereas the
*monthly version sums lags 1 to 11 only and leaves out the current value (11 terms) - confirm
*that this difference is intended
if $freq==4{
	gen svar11=svar+svar[_n-1]+svar[_n-2]+svar[_n-3]
}
if $freq==12{
	gen svar11=svar[_n-1]+svar[_n-2]+svar[_n-3]+svar[_n-4]+svar[_n-5]+svar[_n-6]+svar[_n-7]+svar[_n-8]+svar[_n-9]+svar[_n-10]+svar[_n-11]
}


*The commands in the loop run quietly: the loop issues four summarize and four replace
*commands per period, which would otherwise fill the log with thousands of tables.
*meanonly is used where only r(mean) is needed.
forvalues i=$tsamplestart/$tlastpred{

		*The strict > rather than >=$samplestart means that the trailing mean will be nested
		*in the regressions because the first t for the dependent variable is sample start + 1

		*Only observations up to time i-h are used, i.e. those with time<i+1-h
		quietly sum lnexret if yy>$samplestart & time<`i'+1-$h, meanonly
		quietly replace modelmean=r(mean) if time==`i'

		quietly sum exret if yy>$samplestart & time<`i'+1-$h, meanonly
		quietly replace smodelmean=r(mean) if time==`i'


		*Variance over the window that starts freq*10 periods before i and ends at i-h
		quietly sum lnexret if time>=`i'-$freq*10 & time<`i'+1-$h
		quietly replace modelvar=r(Var) if time==`i'

		quietly sum exret if time>=`i'-$freq*10 & time<`i'+1-$h
		quietly replace smodelvar=r(Var) if time==`i'


}

